modified from http://cs.stanford.edu/people/karpathy/cs231nfiles/minimal_net.html

In [276]:
# A bit of setup
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'spectral'

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

In [938]:
np.random.seed(0)
N = 500 # number of points per class
D = 2 # dimensionality
K = 3 # number of classes
X = np.zeros((N*K,D))
y = np.zeros(N*K, dtype='uint8')
for j in xrange(K):
  ix = range(N*j,N*(j+1))
  r = np.linspace(0.05,1,N) # radius
  t = np.linspace(j*(K+1),(j+1)*(K+1),N) + np.random.randn(N)*0.2 # theta
  X[ix] = np.c_[r*np.sin(t), r*np.cos(t)]
  y[ix] = j
fig = plt.figure()
plt.scatter(X[:, 0], X[:, 1], c=y, s=40,linewidths=0)#, cmap=plt.cm.Spectral)
plt.xlim([-r[-1]*1.1,r[-1]*1.1])
plt.ylim([-r[-1]*1.1,r[-1]*1.1])
#fig.savefig('spiral_raw.png')
Out[938]:
(-1.1000000000000001, 1.1000000000000001)
In [939]:
#Train a Linear Classifier

# initialize parameters randomly
W = 0.01 * np.random.randn(D,K)
b = np.zeros((1,K))

# some hyperparameters
step_size = 1e-0
reg = 1e-3 # regularization strength

# gradient descent loop
num_examples = X.shape[0]
for i in xrange(200):
  
  # evaluate class scores, [N x K]
  scores = np.dot(X, W) + b 
  
  # compute the class probabilities
  exp_scores = np.exp(scores)
  probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True) # [N x K]
  
  # compute the loss: average cross-entropy loss and regularization
  correct_logprobs = -np.log(probs[range(num_examples),y])
  data_loss = np.sum(correct_logprobs)/num_examples
  reg_loss = 0.5*reg*np.sum(W*W)
  loss = data_loss + reg_loss
  if i % 10 == 0:
    print "iteration %d: loss %f" % (i, loss)
  
  # compute the gradient on scores
  dscores = probs
  dscores[range(num_examples),y] -= 1
  dscores /= num_examples
  
  # backpropagate the gradient to the parameters (W,b)
  dW = np.dot(X.T, dscores)
  db = np.sum(dscores, axis=0, keepdims=True)
  
  dW += reg*W # regularization gradient
  
  # perform a parameter update
  W += -step_size * dW
  b += -step_size * db
iteration 0: loss 1.099513
iteration 10: loss 0.909884
iteration 20: loss 0.842730
iteration 30: loss 0.813568
iteration 40: loss 0.799051
iteration 50: loss 0.791149
iteration 60: loss 0.786576
iteration 70: loss 0.783811
iteration 80: loss 0.782084
iteration 90: loss 0.780979
iteration 100: loss 0.780257
iteration 110: loss 0.779778
iteration 120: loss 0.779457
iteration 130: loss 0.779240
iteration 140: loss 0.779091
iteration 150: loss 0.778989
iteration 160: loss 0.778918
iteration 170: loss 0.778869
iteration 180: loss 0.778835
iteration 190: loss 0.778811

In [940]:
# evaluate training set accuracy
scores = np.dot(X, W) + b
predicted_class = np.argmax(scores, axis=1)
print 'training accuracy: %.2f' % (np.mean(predicted_class == y))
training accuracy: 0.52

In [942]:
sigmoid = lambda x : 1.0 / (1.0 + np.exp(-x))
tanh = lambda x: np.tanh(x)
In [943]:
import numpy.random as npr
In [1097]:
# plot the resulting classifier
h = 0.02
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
Z = np.dot(np.c_[xx.ravel(), yy.ravel()], W) + b
Z = np.argmax(Z, axis=1)
Z = Z.reshape(xx.shape)
fig = plt.figure()
plt.contourf(xx, yy, Z, alpha=0.8)
plt.scatter(X[:, 0], X[:, 1], c=y, s=40)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
#fig.savefig('spiral_linear.png')
Out[1097]:
30
In [1098]:
random_starts = []
for i in range(1000):

    h = 30 # size of hidden layer
    W = 0.01 * np.random.randn(D,h)
    b = np.zeros((1,h))
    W2 = 0.01 * np.random.randn(h,K)
    b2 = np.zeros((1,K))

    random_starts.append(mats_to_vec(W,b,W2,b2))
random_starts = np.array(random_starts)
In [1099]:
pdist = distance.pdist(random_starts)

dist_mat = distance.squareform(pdist)

plt.imshow(dist_mat,interpolation='none',
           #norm=ln,
           cmap ='Blues')
#num_ticks = 4
#plt.xticks(np.arange(4)*num_iter/num_ticks)
plt.colorbar()
plt.title('Pairwise distances between random parameter initializations')
plt.savefig('l2-pairwise-param-vec-distances-30h-random.jpg',dpi=600)
In []:
In [1104]:
def train_net(h=30,D=D,K=K,num_iter=10000):
    
    # initialize parameters randomly
    #h = 30 # size of hidden layer
    W = 0.01 * np.random.randn(D,h)
    b = np.zeros((1,h))
    W2 = 0.01 * np.random.randn(h,K)
    b2 = np.zeros((1,K))
    
    # some hyperparameters
    step_size = 1e-0
    reg = 1e-3 # regularization strength
    
    
    # keep a record of training progress
    params = np.hstack((np.reshape(W,W.shape[0]*W.shape[1]),np.reshape(b,b.shape[0]*b.shape[1]),
                        np.reshape(W2,W2.shape[0]*W2.shape[1]),np.reshape(b2,b2.shape[0]*b2.shape[1])))
    num_params = len(params)
    
    run_record = np.zeros((num_iter,num_params))
    loss_record = np.zeros(num_iter)
    
    # stochastic gradient descent loop
    #mini_batch_size = len(X)
    mini_batch_size=100
    for i in xrange(num_iter):
      # minibatch
      if mini_batch_size < len(X):
        mask = npr.rand(len(X)) < (1.0*mini_batch_size / len(X))
        X_ = X[mask]
        y_ = y[mask]
      else:
        X_ = X[:]
        y_ = y[:]
      num_examples = len(X)#X_.shape[0]
      # evaluate class scores, [N x K]
      #hidden_layer = tanh(np.dot(X, W) + b)
      #hidden_layer = sigmoid(np.dot(X, W) + b)
      hidden_layer = np.maximum(0, np.dot(X, W) + b) # note, ReLU activation
      scores = np.dot(hidden_layer, W2) + b2
      
      # compute the class probabilities
      exp_scores = np.exp(scores)
      probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True) # [N x K]
      
      # compute the loss: average cross-entropy loss and regularization
      
      # compute full loss:
      compute_full_loss=False
      if mini_batch_size == len(X) or compute_full_loss:
        correct_logprobs = -np.log(probs[range(len(X)),y])
        data_loss = np.sum(correct_logprobs)/num_examples
        reg_loss = 0.5*reg*np.sum(W*W) + 0.5*reg*np.sum(W2*W2)
        loss = data_loss + reg_loss
        if i % 1000 == 0:
          print "iteration %d: loss %f" % (i, loss)
        loss_record[i] = loss
        # compute the gradient on scores
        dscores = probs
        dscores[range(num_examples),y] -= 1
        dscores /= num_examples
      
      # compute minibatch loss for gradient purposes:
      if mini_batch_size < len(X):
        num_examples = X_.shape[0]
        hidden_layer = np.maximum(0, np.dot(X_, W) + b) # note, ReLU activation
        scores = np.dot(hidden_layer, W2) + b2
        
        # compute the class probabilities
        exp_scores = np.exp(scores)
        probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True) # [N x K]
        correct_logprobs = -np.log(probs[range(num_examples),y_])
        data_loss = np.sum(correct_logprobs)/num_examples
        reg_loss = 0.5*reg*np.sum(W*W) + 0.5*reg*np.sum(W2*W2)
        loss = data_loss + reg_loss
        # compute the gradient on scores
        dscores = probs
        dscores[range(num_examples),y_] -= 1
        dscores /= num_examples
      
      # backpropagate the gradient to the parameters
      # first backprop into parameters W2 and b2
      dW2 = np.dot(hidden_layer.T, dscores)
      db2 = np.sum(dscores, axis=0, keepdims=True)
      # next backprop into hidden layer
      dhidden = np.dot(dscores, W2.T)
      # backprop the ReLU non-linearity
      dhidden[hidden_layer <= 0] = 0
      # finally into W,b
      dW = np.dot(X_.T, dhidden)
      db = np.sum(dhidden, axis=0, keepdims=True)
      
      # add regularization gradient contribution
      dW2 += reg * W2
      dW += reg * W
      
      # perform a parameter update
      W += -step_size * dW
      b += -step_size * db
      W2 += -step_size * dW2
      b2 += -step_size * db2
        
      run_record[i] = np.hstack((np.reshape(W,W.shape[0]*W.shape[1]),np.reshape(b,b.shape[0]*b.shape[1]),
                                   np.reshape(W2,W2.shape[0]*W2.shape[1]),np.reshape(b2,b2.shape[0]*b2.shape[1])))
    return run_record
In [1105]:
run_record1 = train_net()
In []:
run_records = []
In [1149]:
# how far away are the endpoints of multiple runs?
from time import time
t = time()

for i in range(80):
    run_records.append(train_net())
    print(i)
    print(time() - t)
0
7.45791292191
1
15.3475661278
2
23.1817469597
3
31.306440115
4
39.042247057
5
46.9533650875
6
54.6188650131
7
62.3704459667
8
70.2733590603
9
78.6233141422
10
87.043815136
11
94.4979491234
12
102.062829018
13
109.494400978
14
116.972666979
15
124.40716815
16
132.65359807
17
141.047703981
18
149.042865992
19
156.998424053
20
165.14190793
21
172.733784914
22
180.139306068
23
187.649156094
24
195.534194946
25
203.416218996
26
211.032681942
27
218.112601995
28
225.832948923
29
233.665717125
30
241.779407024
31
249.546333075
32
257.763616085
33
266.038578033
34
274.112976074
35
282.159974098
36
289.701508045
37
297.831735134
38
306.056132078
39
313.52570796
40
321.126660109
41
328.296725988
42
335.622035027
43
342.927256107
44
351.254452944
45
358.987678051
46
366.603663921
47
376.284378052
48
387.776755095
49
406.616961002
50
423.456924915
51
454.840601921
52
464.909745932
53
473.106658936
54
480.417410135
55
487.916582108
56
496.06209898
57
504.331903934
58
512.220806122
59
519.705235004
60
527.545475006
61
535.668530941
62
543.067381144
63
550.179274082
64
558.124238014
65
565.953258038
66
573.270612955
67
580.614607096
68
587.817709923
69
595.746259928
70
604.204235077
71
612.519675016
72
619.838376999
73
627.332477093
74
634.965758085
75
642.629504919
76
649.679773092
77
657.667140961
78
666.531707048
79
674.039299965

In [1150]:
start_points = np.array([run_record[0] for run_record in run_records])
pdist = distance.pdist(start_points)
max(pdist),min(pdist)
Out[1150]:
(0.37691489582619869, 0.14943557207906349)
In [1155]:
def pairwise_distance_over_time(run_records):
    n = len(run_records[0])
    
    current_points = np.array([run_record[0] for run_record in run_records])
    pdist = distance.pdist(current_points)
    num_dist = len(pdist)
    distances = np.zeros((n,num_dist))
    for i in range(n):
        current_points = np.array([run_record[i] for run_record in run_records])
        pdist = distance.pdist(current_points)
        distances[i] = pdist
    return distances

def plot_pairwise(run_records):
    distances = pairwise_distance_over_time(run_records)
    plt.plot(np.mean(distances,1))
    
    for i in range(distances.shape[1]):
        plt.scatter(range(distances.shape[0]),distances[:,i],
                   linewidths=0,alpha=0.5)
In [1156]:
distances = pairwise_distance_over_time(run_records)
In [1160]:
plt.plot(np.mean(distances,1))
Out[1160]:
[<matplotlib.lines.Line2D at 0x4752e5350>]
In [1164]:
distances.shape
Out[1164]:
(10000, 4950)
In [1165]:
#lin_dist = distances.reshape(np.prod(distances.shape))
In [1170]:
lin_dist=np.zeros(np.prod(distances.shape))
for i in range(len(distances.T)):
    lin_dist[i*distances.shape[0]:(i+1)*distances.shape[0]] = distances[:,i]
In [1166]:
iter_num = np.hstack((range(len(distances)) for _ in range(distances.shape[1])))
In [1173]:
plt.scatter(iter_num[::100],lin_dist[::100],
            linewidth=0)
Out[1173]:
<matplotlib.collections.PathCollection at 0x2e2323850>
In [1201]:
plt.scatter(iter_num[iter_num<2000][::15],lin_dist[iter_num<2000][::15],
            linewidth=0,alpha=0.005);
In [1188]:
dist_vecs = [distance.pdist(run[::10]) for run in run_records]
In [1191]:
np.array(dist_vecs).shape
Out[1191]:
(100, 499500)
In [1192]:
avg_dist_vec = np.mean(np.array(dist_vecs),0)
In [1193]:
avg_pdist = distance.squareform(avg_dist_vec)
In [1238]:
for i,dist_vec in enumerate(dist_vecs):
    plt.imshow(distance.squareform(dist_vec),
               interpolation='none',
               cmap='Blues')
    plt.savefig('distmat {0}.jpg'.format(i),dpi=600) 

which examples are most typical and which examples are least typical?
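
Reading 'examples' here as the training runs, one rough way to quantify this is to score each run by how far its pairwise-distance matrix sits from the average one (a sketch, not executed; uses dist_vecs and avg_dist_vec from the cells above):

In []:
# sketch: most / least typical runs, measured by distance of each run's
# pairwise-distance vector from the average distance vector
typicality = np.array([np.linalg.norm(dv - avg_dist_vec) for dv in dist_vecs])
print(np.argmin(typicality), np.argmax(typicality))  # most typical, least typical run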

In []:
In [1194]:
plt.imshow(avg_pdist,interpolation='none',
           #norm=ln,
           cmap ='Blues')
#num_ticks = 4
#plt.xticks(np.arange(4)*num_iter/num_ticks)
plt.colorbar()
Out[1194]:
<matplotlib.colorbar.Colorbar instance at 0x512598dd0>
In [1195]:
std_dist_vec = np.std(np.array(dist_vecs),0)
In [1196]:
std_pdist = distance.squareform(std_dist_vec)
In [1197]:
plt.imshow(std_pdist,interpolation='none',
           #norm=ln,
           cmap ='Blues')
#num_ticks = 4
#plt.xticks(np.arange(4)*num_iter/num_ticks)
plt.colorbar()
Out[1197]:
<matplotlib.colorbar.Colorbar instance at 0x50e6634d0>
In [1207]:
plt.hist2d(iter_num[iter_num<2000],lin_dist[iter_num<2000],bins=50,
           cmap='Blues');
plt.colorbar()
Out[1207]:
<matplotlib.colorbar.Colorbar instance at 0x512412290>
In []:
for i in range(distances[:,:100].shape[1]):
    plt.scatter(range(distances.shape[0]),distances[:,i],
               linewidths=0,alpha=0.5)
In [1151]:
end_points = np.array([run_record[-1] for run_record in run_records])
In [1152]:
end_points.shape
Out[1152]:
(100, 183)
In [1153]:
pdist = distance.pdist(end_points)
max(pdist),min(pdist)
Out[1153]:
(31.178205827282927, 8.6517482027073012)
In [1217]:
run_records_ = np.vstack(run_records)
run_records_.shape
Out[1217]:
(1000000, 183)
In [1147]:
run_records_[::50].shape
Out[1147]:
(3900, 183)
In [1221]:
all_dist = distance.pdist(run_records_[npr.rand(len(run_records_))<0.001])
In [1222]:
plt.hist(all_dist,bins=100);
In [1225]:
pca = PCA()
pca.fit(run_records_[::10])
Out[1225]:
PCA(copy=True, n_components=None, whiten=False)
In [1226]:
X_ = pca.transform(run_records_)
In [1227]:
pca.explained_variance_ratio_[:2]
Out[1227]:
array([ 0.04538429,  0.03710254])
In [1228]:
c = np.hstack([i*np.ones(len(run_records[0])) for i in range(len(run_records))])
c.shape
Out[1228]:
(1000000,)
In [1229]:
c[::thinning].shape
Out[1229]:
(20000,)
In [1237]:
thinning=50
plt.scatter(X_[::thinning,0],X_[::thinning,1],
            c=c[::thinning],
            alpha=0.5,
            linewidths=0)
plt.xlabel('PC1')
plt.ylabel('PC2')
Out[1237]:
<matplotlib.text.Text at 0x54d7a1a10>
In [1236]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

ax.scatter(X_[::thinning,0],X_[::thinning,1],X_[::thinning,2],
           alpha=0.5,c=c[::thinning],linewidths=0)
Out[1236]:
<mpl_toolkits.mplot3d.art3d.Path3DCollection at 0x54aa09550>
In [962]:
run_record.shape
Out[962]:
(10000, 183)
In [946]:
D,h,K
Out[946]:
(2, 3, 3)
In [947]:
l1 = lambda x,y : sum(abs(x-y))
In [948]:
l2 = lambda x,y : np.sqrt(sum((x-y)**2))
In [949]:
norm = l2
In [963]:
distance_from_start = np.array([norm(run_record[i],run_record[0]) for i in xrange(len(run_record))])
In [964]:
plt.rcParams['font.family'] = 'serif'
In [965]:
plt.plot(distance_from_start)
plt.hlines(max(distance_from_start),0,len(distance_from_start),linestyle='--')
plt.title('Distance from initial parameter configuration')
plt.xlabel('Iteration')
plt.ylabel('Euclidean distance')
Out[965]:
<matplotlib.text.Text at 0x166f2c2d0>
In [999]:
best_index = np.argmin(loss_record)
distance_from_best = np.array([norm(run_record[i],run_record[best_index]) for i in xrange(len(run_record))])
best_index
Out[999]:
0
In [760]:
plt.plot(distance_from_best)
plt.vlines(best_index,0,max(distance_from_best),linestyle='--')

plt.title('Distance from best parameter configuration')
plt.xlabel('Iteration')
plt.ylabel('Euclidean distance')
Out[760]:
<matplotlib.text.Text at 0x161df5150>
In [761]:
plt.plot(loss_record)
plt.title('Loss')
plt.xlabel('Iteration')
plt.ylabel('Loss')
plt.ylim(0,max(loss_record))
plt.vlines(best_index,0,max(loss_record),linestyle='--')
Out[761]:
<matplotlib.collections.LineCollection at 0x163334810>
In [762]:
worst_index = np.argmax(loss_record)
distance_from_worst = np.array([norm(run_record[i],run_record[worst_index]) for i in xrange(len(run_record))])
worst_index
Out[762]:
9412
In [763]:
plt.plot(distance_from_worst)
plt.vlines(worst_index,0,max(distance_from_worst),linestyle='--')

plt.title('Distance from worst parameter configuration')
plt.xlabel('Iteration')
plt.ylabel('Euclidean distance')
Out[763]:
<matplotlib.text.Text at 0x16330a810>
In [1001]:
# evaluate training set accuracy
hidden_layer = np.maximum(0, np.dot(X, W) + b)
scores = np.dot(hidden_layer, W2) + b2
predicted_class = np.argmax(scores, axis=1)
print 'training accuracy: %.2f' % (np.mean(predicted_class == y))
training accuracy: 0.98

In [1240]:
def accuracy(vec,X=X,y=y,D=D,h=h,K=K):
    W,b,W2,b2 = vec_to_mats(vec,D,h,K)
    hidden_layer = np.maximum(0, np.dot(X, W) + b)
    scores = np.dot(hidden_layer, W2) + b2
    predicted_class = np.argmax(scores, axis=1)
    return (np.mean(predicted_class == y))
In [1332]:
def loss(vec,X=X,y=y,D=D,h=h,K=K):
    W,b,W2,b2 = vec_to_mats(vec,D,h,K)
    hidden_layer = np.maximum(0, np.dot(X, W) + b)
    scores = np.dot(hidden_layer, W2) + b2
    num_examples = len(X)
    # compute the class probabilities
    exp_scores = np.exp(scores)
    probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True) # [N x K]
    corect_logprobs = -np.log(probs[range(num_examples),y])
    data_loss = np.sum(corect_logprobs)/num_examples
    reg_loss=0
    #reg_loss = 0.5*reg*np.sum(W*W) + 0.5*reg*np.sum(W2*W2)
    return data_loss + reg_loss
In [1255]:
D
Out[1255]:
2
In [1256]:
run_records[0][0].shape
Out[1256]:
(183,)
In [1258]:
%timeit loss(run_records[0][0])
1000 loops, best of 3: 686 µs per loop

In [1260]:
interp = lambda initial,final,alpha=0.5: (1-alpha)*initial + alpha*final
In [1372]:
dim = len(end_points[0])
In [1373]:
alpha= np.arange(-0.2,1.5,0.05)
inds = range(len(start_points))
In [1374]:
line_losses = [[loss(interp(start_points[i],end_points[i],a)) for a in alpha] for i in inds]
In [1375]:
line_losses = [[loss(interp(np.zeros(dim),end_points[i],a)) for a in alpha] for i in inds]
In [1604]:
line_losses = [[loss(interp(np.zeros(dim),run_records[i][-1],a)) for a in alpha] for i in inds]
In [1605]:
line_losses = np.array(line_losses)
line_losses.shape
Out[1605]:
(100, 34)
In [1606]:
np.min(line_losses)
Out[1606]:
0.027062382424285387
In [1607]:
plt.plot(line_losses[0]);
In [1608]:
best_solution = min([loss(e) for e in end_points])
best_solution
Out[1608]:
0.077140585108297591
In [1615]:
[plt.plot(alpha,line_losses[i],alpha=0.3) for i in range(100)];
plt.hlines(best_solution,alpha[0],alpha[-1],linestyle='--',color='grey')
plt.vlines(1,0,np.max(line_losses),linestyle='--',color='grey')
plt.xlabel('alpha')
plt.ylabel('loss')
Out[1615]:
<matplotlib.text.Text at 0x53c473910>
In [1370]:
np.mean(abs(end_points))
Out[1370]:
0.78300038541779071
In [1371]:
np.mean(abs(npr.randn(1000)))
Out[1371]:
0.80506696282332135

Hmm, so what does this look like if you pick a random direction? Or take one good network and scramble its weight vector? Or interpolate between two good networks?
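
As a quick sketch of the last question, the loss can be evaluated along the straight line between two independently trained solutions (uses loss, interp, and end_points from above; indices 0 and 1 are arbitrary):

In []:
# loss along the straight line between two SGD solutions
alphas = np.arange(-0.2, 1.2, 0.05)
pair_losses = [loss(interp(end_points[0], end_points[1], a)) for a in alphas]
plt.plot(alphas, pair_losses)
plt.xlabel('alpha')
plt.ylabel('loss')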

In [1895]:
rand_end_points = npr.randn(10000,dim)
In [1918]:
line_losses = [[loss(interp(np.zeros(dim),rand_end_points[i],alpha)) for alpha in np.arange(-1,1,0.1)] for i in range(10000)]
In [1904]:
[plt.plot(line) for line in line_losses[:10]];
In [1921]:
np.min(line_losses),np.max(line_losses)
Out[1921]:
(0.78421095702561372, 19.994141412327814)
In [1902]:
np.array(line_losses).shape
Out[1902]:
(10, 20)
In [1922]:
rand_min_losses = np.array([np.min(l) for l in line_losses])
In [1924]:
plt.hist(rand_min_losses,bins=50,normed=True,log=True);
plt.vlines(best_solution,0,130.0,linestyle='--',color='grey',label='Best SGD solution')
plt.vlines(np.min(rand_min_losses),0,130.0,linestyle='--',color='blue',label='Best random direction')
plt.legend(loc='best')
plt.xlabel('Loss')
plt.ylabel('Probability density')
plt.title('Line search in random directions doesn\'t work')
Out[1924]:
<matplotlib.text.Text at 0x54b5d4610>
In [1416]:
scrambled_end_points = [end_points[0][sorted(range(dim),key=lambda i:npr.rand())] for _ in range(100)]
In [1418]:
np.min(line_losses)
Out[1418]:
0.91941408732429475
In [1423]:
end_points.shape
Out[1423]:
(100, 183)
In [1428]:
best_ind = np.argmin(np.array([loss(e) for e in end_points]))
best_ind
Out[1428]:
55
In [1429]:
loss(end_points[best_ind])
Out[1429]:
0.077140585108297591
In [1437]:
line_losses = np.array([[loss(interp(np.zeros(dim),scrambled_end_points[best_ind],a)) for a in alpha] for i in inds])
In [1454]:
plt.bar(range(len(end_points.T)),sorted(np.mean(end_points,0)));
In [1482]:
stats.ttest_1samp(end_points[:,i],0)[0]
Out[1482]:
2.954386291869936e-21
In [1486]:
from scipy import stats

sig = np.zeros(end_points.shape[1])
for i in range(end_points.shape[1]):
    #mean,(low,up) = stats.bayes_mvs(end_points[:,i],alpha=0.95)[0]
    ##sig[i] = (low*up>0)*abs(mean / (up-low))
    sig[i]=abs(stats.ttest_1samp(end_points[:,i],0)[0])
In [1487]:
plt.plot(sig)
Out[1487]:
[<matplotlib.lines.Line2D at 0x544526f50>]
In [1498]:
mats=vec_to_mats(sig,D,h,K)
In [1504]:
fig, axes = plt.subplots(nrows=2, ncols=2)
for i,ax in enumerate(axes.flat):
    im = ax.imshow(mats[i],cmap='Blues')

fig.subplots_adjust(right=0.8)
cbar_ax = fig.add_axes([0.85, 0.15, 0.05, 0.7])
fig.colorbar(im,cax=cbar_ax)#, cax=cbar_ax)
Out[1504]:
<matplotlib.colorbar.Colorbar instance at 0x4d05f6128>

Hmm, so it looks like the biases learn consistent values across multiple runs.
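
A quick way to check this directly (a sketch; assumes the 183-dimensional parameter layout from train_net, i.e. D=2, 30 hidden units, K=3): compare the across-run spread of each parameter block.

In []:
# per-parameter standard deviation across runs, split back into W, b, W2, b2 blocks
stds = np.std(end_points, 0)
W_s, b_s, W2_s, b2_s = vec_to_mats(stds, 2, 30, 3)
print(np.mean(W_s), np.mean(b_s), np.mean(W2_s), np.mean(b2_s))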

In [1458]:
end_points[:,0].shape
Out[1458]:
(100,)
In [1444]:
plt.hist([l1(np.zeros(dim),e) for e in end_points],bins=50);
In [1506]:
plt.hist([l2(np.zeros(dim),e) for e in end_points],bins=100);
In [1616]:
aligned_end_points = heuristic_alignment(end_points,D,h,K)
In [1618]:
raw_distances = distance.pdist(end_points)
aligned_distances = distance.pdist(aligned_end_points)
In [1619]:
raw_distances.mean(),aligned_distances.mean()
Out[1619]:
(24.170295033505681, 20.79432166590934)
In [1620]:
raw_distances.min(),aligned_distances.min()
Out[1620]:
(8.6517482027073012, 4.7948656369942562)
In [1510]:
plt.hist([l2(np.zeros(dim),18*npr.randn(dim)/14) for _ in range(1000)],bins=100);
In [1438]:
plt.plot(line_losses[10])
Out[1438]:
[<matplotlib.lines.Line2D at 0x486ab3c50>]

Compared with the angle from start to finish, what is the angle from start to current?
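
Read literally, that is the cosine similarity between (current - start) and (end - start); a sketch for a single run is below (the cells that follow use the raw current parameter vector instead, which is nearly the same thing since the runs start close to the origin):

In []:
cos_sim = lambda x, y: x.dot(y) / (np.sqrt(np.sum(x**2)) * np.sqrt(np.sum(y**2)))
r0, s0, e0 = run_records[0], start_points[0], end_points[0]
plt.plot([cos_sim(e0 - s0, r0[i] - s0) for i in range(1, len(r0))])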

In [1511]:
start_to_f = [e - i for (e,i) in zip(end_points,start_points)]
In [1522]:
e = end_points[0]
e.dot(-e)/(np.sqrt(sum(e**2))*np.sqrt(sum(e**2)))
Out[1522]:
-1.0000000000000004
In [1576]:
angle = lambda x,y:x.dot(y)/(np.sqrt(sum(x**2))*np.sqrt(sum(y**2)))
In [1534]:
r = run_records[0]
In [1539]:
plt.plot(angle(start_to_f[0],r.T))
Out[1539]:
[<matplotlib.lines.Line2D at 0x4d0a91c10>]
In [1784]:
[plt.plot(angle(start_to_f[i],run_records[i].T),alpha=0.5) for i in range(100)];
plt.xlabel('Iteration #')
plt.ylabel('Cosine similarity w.r.t. final')
Out[1784]:
<matplotlib.text.Text at 0x53e47aa90>
In [1554]:
angle(end_points[40],end_points[41])
Out[1554]:
0.042579906747210766
In [1563]:
run_records[0].shape
Out[1563]:
(10000, 183)
In [1572]:
run_records[0][:,:-lag].dot(run_records[0][:,:-lag].T).shape
Out[1572]:
(10000, 10000)
In [1586]:
def lag_angle(record,lag=10):
    n = len(record) - lag
    angles = np.zeros(n)
    for i in range(n):
        angles[i] = angle(record[i],record[i+lag])
    return angles
In [1587]:
run_records[:10][0].shape
Out[1587]:
(10000, 183)
In [1783]:
lag=200
[plt.plot(lag_angle(r[:1200],lag),alpha=0.5) for r in run_records];
plt.xlabel('Iteration #')
plt.ylabel('Cosine similarity at lag {0}'.format(lag))
Out[1783]:
<matplotlib.text.Text at 0x50816af50>
In []:
In [1601]:
cosine_distances = distance.pdist(end_points,'cosine')-1
distance.squareform(cosine_distances).shape
Out[1601]:
(100, 100)
In [1602]:
np.mean(cosine_distances),np.min(cosine_distances),np.max(cosine_distances)
Out[1602]:
(-0.028611133199068644, -0.57503860316258859, 0.37360094254637466)
In []:
In [1600]:
distance.cosine(np.ones(200),np.ones(200))
Out[1600]:
1.1102230246251565e-16
In [1559]:
plt.imshow(distance.squareform(distance.pdist(r[::10],'cosine')),cmap='Blues')
Out[1559]:
<matplotlib.image.AxesImage at 0x50f13b650>

How sensitive is this result to the angle? i.e. \(\left|\frac{\partial \min J}{\partial \theta}\right|\)?

Now it's convenient to define a loss function in terms of just the direction, since the minimum can be found easily using a coarse line search

Line loss
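
In other words, for a direction \(\theta\) the line loss used below is \(J_{\mathrm{line}}(\theta) = \min_{\alpha} J(\alpha\,\theta)\), with the minimum taken over a coarse grid of step sizes \(\alpha\).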

In [1738]:
min_line_loss = lambda vec,loss=loss,alpha=np.arange(0.0,1.5,0.075):np.min([loss(interp(np.zeros(len(vec)),vec,a)) for a in alpha])
In [1648]:
loss_as_f_alpha = lambda vec,loss=loss:lambda alpha:loss(alpha*vec)
In []:
In [1656]:
from scipy import optimize
direction = end_points[best_ind]
loss_in_best_dir = loss_as_f_alpha(direction)
optimize.fmin(loss_as_f_alpha(direction),1.0)
Optimization terminated successfully.
         Current function value: 0.059242
         Iterations: 12
         Function evaluations: 24

Out[1656]:
array([ 1.16894531])
In [1674]:
t = time()
optimize.fmin(loss_in_best_dir,1.0,maxfun=30,disp=0)
print(time() - t)
0.0413529872894

In [1645]:
%timeit min_line_loss(npr.randn(dim))
100 loops, best of 3: 12.2 ms per loop

In [1646]:
min_line_loss(end_points[best_ind])
Out[1646]:
0.059259347625614918
In [1631]:
def gradient(func,x0,h=0.001):
    x0 = np.array(x0)#,dtype=float)
    y = func(x0)
    deriv = np.zeros(len(x0))
    for i in range(len(x0)):
        x = np.array(x0)
        x[i] += h
        deriv[i] = (func(x) - y)/h
    return deriv
In [1635]:
%timeit gradient(min_line_loss,end_points[best_ind])
1 loops, best of 3: 1.51 s per loop

In [1638]:
new_dir = end_points[best_ind] - gradient(min_line_loss,end_points[best_ind])
In [1675]:
min_line_loss_scipy = lambda vec,loss=loss:optimize.fmin(loss_as_f_alpha(vec),1.0,maxfun=30,disp=0)
In [1677]:
gradient(min_line_loss_scipy,end_points[best_ind])[:10]
Out[1677]:
array([ 0.        ,  0.09765625,  0.        ,  0.09765625,  0.09765625,
        0.        ,  0.        ,  0.        ,  0.09765625,  0.09765625])
In [1678]:
min_line_loss(new_dir)
Out[1678]:
0.043117361081621064
In [1687]:
npr.seed(0)


new_dir = npr.randn(dim)
directions = [new_dir]
losses = [min_line_loss(new_dir)]
for i in range(10):
    new_dir -= 100*gradient(min_line_loss,new_dir)
    directions.append(new_dir)
    losses.append(min_line_loss(new_dir))
    print(losses[-1])
1.09867824926
1.09863392837
1.09850089274
1.08799917902
1.07802230694
1.06780542019
1.05722111163
1.04630885214
1.03490924526
1.02280603572

In [1683]:
sum(abs(new_dir))
Out[1683]:
152.44633840588705
In [1840]:
def adaptive_metropolis_hastings(func,x0,num_steps=1000,step_size=0.1,verbose=False):
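    # note: with the greedy "accept only if not worse" rule below, this is really
    # stochastic hill-climbing with an adaptive step size rather than true
    # Metropolis-Hastings; the MH acceptance rule is left commented out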
    locs = np.zeros((num_steps+1,len(x0)))
    vals = np.zeros(num_steps+1)
    locs[0] = x0
    vals[0] = func(x0)
    num_rejects=0
    num_accepts=0
    message = ''
    adaptation_rate=1.02
    steps = np.zeros(num_steps+1)
    steps[0] = step_size
    for i in range(num_steps):
        new_loc = locs[i] + npr.randn(len(locs[i]))*step_size
        new_val = func(new_loc)
        #accept = npr.rand() < np.exp(-(new_val - vals[i]))#/vals[i]#np.exp(-(new_val - vals[i]))
        accept = new_val <= vals[i]#*(1+npr.randn()*0.01)# or (npr.rand() < 0.05)
        if not accept:
            new_loc = locs[i]
            new_val = vals[i]
            num_rejects+=1
            num_accepts=0
            if num_rejects > 10:
                if step_size > 1e-3:
                    step_size/=adaptation_rate
                message = message+'\n reduced step size'
        else:
            num_rejects=0
            num_accepts+=1
            if num_accepts > 10:
                if step_size < 1.0:
                    step_size*=adaptation_rate
                message = message+ '\n increased step size'
        locs[i+1] = new_loc
        vals[i+1] = new_val
        if i % 100 ==0:
            print("{0}: {1:.3}".format(i,new_val))
            if verbose:
                print(message)
            message = ''
        steps[i+1] = step_size
    return locs,vals,steps
In [1841]:
np.arange(0.0,1.5,0.1).shape
Out[1841]:
(15,)
In [1842]:
def min_line_loss(vec,loss=loss,alpha=np.arange(0.0,1.5,0.1)):
    return np.min([loss(interp(np.zeros(len(vec)),vec,a)) for a in alpha])
In [1843]:
locs,vals,steps = adaptive_metropolis_hastings(min_line_loss,end_points[best_ind],num_steps=2000,step_size=0.01)
#+npr.randn(dim)*0.01
0: 0.0577
100: 0.0267
200: 0.0223
300: 0.0213
400: 0.0209
500: 0.0203
600: 0.0198
700: 0.0195
800: 0.0192
900: 0.0187
1000: 0.0183
1100: 0.0178
1200: 0.0174
1300: 0.0171
1400: 0.0169
1500: 0.0165
1600: 0.0163
1700: 0.0162
1800: 0.0161
1900: 0.0159

In [1865]:
plt.plot(vals)
plt.hlines(vals[0],0,len(vals),linestyle='--',color='grey')
plt.ylabel('loss')
plt.xlabel('Hill-climbing step')
Out[1865]:
<matplotlib.text.Text at 0x548bc0c10>
In [1845]:
plt.plot(steps)
Out[1845]:
[<matplotlib.lines.Line2D at 0x547476c10>]
In [1810]:
np.mean(abs(end_points[best_ind]))
Out[1810]:
0.80133638136063834
In [1811]:
np.mean(abs(npr.randn(dim)))
Out[1811]:
0.91569353934038389
In [1849]:
random_directions=npr.randn(100,dim)
In [1850]:
best_rand = np.argmin([min_line_loss(r) for r in random_directions])
In [1886]:
multiple_hc_runs_near = []
for i in range(10):
    locs,vals,steps = adaptive_metropolis_hastings(min_line_loss,end_points[best_ind]+npr.randn(dim)*0.5,num_steps=1000,step_size=0.01)
    multiple_hc_runs_near.append((locs,vals,steps))
#locs,vals,steps = adaptive_metropolis_hastings(min_line_loss,end_points[best_ind]+0.5*npr.randn(dim),num_steps=5000,step_size=0.01)
0: 0.69
100: 0.158
200: 0.0678
300: 0.0472
400: 0.0407
500: 0.0361
600: 0.0322
700: 0.0296
800: 0.0278
900: 0.027
0: 1.0
100: 0.327
200: 0.179
300: 0.111
400: 0.0713
500: 0.054
600: 0.0413
700: 0.0392
800: 0.0347
900: 0.031
0: 0.922
100: 0.565
200: 0.241
300: 0.102
400: 0.0694
500: 0.053
600: 0.0472
700: 0.0451
800: 0.0422
900: 0.0396
0: 0.852
100: 0.697
200: 0.508
300: 0.309
400: 0.164
500: 0.0883
600: 0.0545
700: 0.0423
800: 0.034
900: 0.0302
0: 0.72
100: 0.573
200: 0.448
300: 0.292
400: 0.144
500: 0.0597
600: 0.0444
700: 0.0357
800: 0.0316
900: 0.0284
0: 0.599
100: 0.273
200: 0.132
300: 0.0689
400: 0.0537
500: 0.0465
600: 0.0404
700: 0.0355
800: 0.0335
900: 0.0314
0: 0.992
100: 0.856
200: 0.639
300: 0.37
400: 0.203
500: 0.112
600: 0.086
700: 0.0741
800: 0.0683
900: 0.0639
0: 0.657
100: 0.441
200: 0.299
300: 0.193
400: 0.126
500: 0.08
600: 0.0526
700: 0.0413
800: 0.0323
900: 0.0285
0: 0.936
100: 0.817
200: 0.666
300: 0.5
400: 0.353
500: 0.212
600: 0.132
700: 0.0713
800: 0.0552
900: 0.0448
0: 0.508
100: 0.288
200: 0.161
300: 0.093
400: 0.0577
500: 0.0417
600: 0.0349
700: 0.0303
800: 0.0267
900: 0.0257

In [1879]:
# random initialization
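# (assumption: multiple_hc_runs holds hill-climbing runs produced the same way as
#  multiple_hc_runs_near above, but started from random initial directions; it is
#  not regenerated in this notebook)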
for soln in multiple_hc_runs[:-1]:
    vals = soln[1]
    plt.plot((np.arange(len(vals))*15),vals)
vals = multiple_hc_runs[-1][1]
plt.plot((np.arange(len(vals))*15),vals,label='Hill-climbing + line search')
plt.hlines(best_solution,0,(len(vals))*15,linestyle='--',color='grey',label='Best SGD solution')
plt.vlines(10000,0,1,linestyle='--',color='grey')#'blue',label='Num fxn evaluations used in SGD search')
plt.xlabel('Loss function evaluations')
plt.ylabel('Loss')
plt.legend(loc='best')
plt.title('Random initialization')
Out[1879]:
<matplotlib.text.Text at 0x53e15ec10>
In [1887]:
for soln in multiple_hc_runs_near[:-1]:
    vals = soln[1]
    plt.plot((np.arange(len(vals))*15),vals)
vals = multiple_hc_runs_near[-1][1]
plt.plot((np.arange(len(vals))*15),vals,label='Hill-climbing + line search')
plt.hlines(best_solution,0,(len(vals))*15,linestyle='--',color='grey',label='Best SGD solution')
plt.vlines(10000,0,1,linestyle='--',color='grey')#'blue',label='Num fxn evaluations used in SGD search')
plt.xlabel('Loss function evaluations')
plt.ylabel('Loss')
plt.legend(loc='best')
plt.title('Initialized near SGD solution')
Out[1887]:
<matplotlib.text.Text at 0x5400cc610>
In [1890]:
hc_solns = np.array([run[0][-1] for run in multiple_hc_runs_near])
In [1892]:
hc_angles = []
for i in range(len(hc_solns)):
    for j in range(i):
        if i!=j:
            hc_angles.append(angle(hc_solns[i],hc_solns[j]))
hc_angles = np.array(hc_angles)
In [1893]:
plt.hist(hc_angles,bins=20)
In [1864]:
plt.plot(steps)
Out[1864]:
[<matplotlib.lines.Line2D at 0x548beffd0>]
In [1684]:
sum(abs(gradient(min_line_loss,new_dir)))
Out[1684]:
0.0061784101530459878
In [1634]:
plt.plot(sorted(gradient(min_line_loss,end_points[best_ind])))
Out[1634]:
[<matplotlib.lines.Line2D at 0x53cb146d0>]
In [1050]:
from scipy.spatial import distance
pdist = distance.pdist(run_record)
In [1051]:
dist_mat = distance.squareform(pdist)
In [1052]:
from matplotlib import colors
ln = colors.LogNorm()
In [1055]:
plt.imshow(dist_mat,interpolation='none',
           #norm=ln,
           cmap ='Blues')
#num_ticks = 4
#plt.xticks(np.arange(4)*num_iter/num_ticks)
plt.colorbar()
plt.title('Pairwise distances between parameter configurations during training')
plt.savefig('l2-pairwise-param-vec-distances-3h-3-class-normed.jpg',dpi=600)
In [959]:
def pairwise_distances(X, metric=l2):
    ''' naive O(n^2) pairwise distance matrix using the given metric '''
    pdist = np.zeros((len(X),len(X)))
    for i in range(len(X)):
        for j in range(len(X)):
            pdist[i,j] = metric(X[i],X[j])
    return pdist
In [1000]:
W,b,W2,b2 = vec_to_mats(run_record[-1],D,h,K)

# plot the resulting classifier
h = 0.02
x_min, x_max = X[:, 0].min() - 0.1, X[:, 0].max() + 0.1
y_min, y_max = X[:, 1].min() - 0.1, X[:, 1].max() + 0.1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
Z = np.dot(np.maximum(0, np.dot(np.c_[xx.ravel(), yy.ravel()], W) + b), W2) + b2
Z = np.argmax(Z, axis=1)
Z = Z.reshape(xx.shape)
fig = plt.figure()
plt.contourf(xx, yy, Z, cmap=plt.cm.Greys, alpha=0.8)
plt.scatter(X[:, 0], X[:, 1], c=y,linewidths=0)#, s=40)#, cmap=plt.cm.Greys)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
#fig.savefig('spiral_net.png')
Out[1000]:
(-1.0509244727707623, 1.0690755272292396)

Desiderata:

Permutation invariance
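
To make this desideratum concrete, here is a minimal check with freshly sampled (untrained) matrices: permuting the hidden units, i.e. the columns of W and b together with the rows of W2, leaves the network function unchanged.

In []:
# permuting hidden units leaves the ReLU network's outputs unchanged
Wp = npr.randn(2, 10); bp = npr.randn(1, 10)
W2p = npr.randn(10, 3); b2p = npr.randn(1, 3)
perm = npr.permutation(10)
x_test = npr.randn(5, 2)
out1 = np.dot(np.maximum(0, np.dot(x_test, Wp) + bp), W2p) + b2p
out2 = np.dot(np.maximum(0, np.dot(x_test, Wp[:, perm]) + bp[:, perm]), W2p[perm, :]) + b2p
print(np.allclose(out1, out2))  # True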

In [359]:
in_dim = 2
hidden_dim = 10
out_dim = 2

W1 = npr.randn(in_dim,hidden_dim)
W2 = npr.randn(hidden_dim,out_dim)
In [362]:
npr.randn(2).dot(W1)
Out[362]:
array([ 0.80883726, -1.70239477,  1.81801994, -2.51101393,  1.85871958,
       -1.2538666 ,  3.14945952, -0.33905906,  3.24302743,  0.06636554])
In [363]:
from scipy.misc import comb
In [369]:
np.prod(range(1,20))
Out[369]:
121645100408832000
In [374]:
np.prod(W1.shape) + np.prod(W2.shape)
Out[374]:
40
In [375]:
parameter_vector = np.hstack((W1.reshape(np.prod(W1.shape)),W2.reshape(np.prod(W2.shape))))
In [398]:
def mats_to_vec(W,b,W2,b2):
    
    return np.hstack((np.reshape(W,W.shape[0]*W.shape[1]),np.reshape(b,b.shape[0]*b.shape[1]),
                      np.reshape(W2,W2.shape[0]*W2.shape[1]),np.reshape(b2,b2.shape[0]*b2.shape[1])))
In [412]:
def vec_to_mats(param_vector,D,h,K):
    ''' D is input dimensionality, h is number of hidden units, K is number of output units '''
    
    index = 0
    W = param_vector[index:index + D*h].reshape((D,h))
    index += D*h
    b = param_vector[index:index+h].reshape((1,h))
    index += h
    W2 = param_vector[index:index+h*K].reshape((h,K))
    index+= h*K
    b2 = param_vector[index:index +K].reshape((1,K))    
    return W,b,W2,b2
In [770]:
mats = vec_to_mats(run_record[0],D,h,K)
W1,b1,W2,b2 = mats
In [771]:
W1.shape,b1.shape,W2.shape,b2.shape
Out[771]:
((2, 0), (1, 0), (0, 3), (1, 3))
In [533]:
def find_permutation(mats):
    ''' simple heuristic, permute according to the column-wise max values of W1'''
    #return range(W1.shape[1])
    W1,b1,W2,b2 = mats
    #return sorted(range(W1.shape[1]),key=lambda i:b1[0,i])
    #return sorted(range(W1.shape[1]),key=lambda i:np.max(W1[:,i])+np.max(W2[i,:]))
    #return sorted(range(W1.shape[1]),key=lambda i:np.max(W2[i,:]))
    return sorted(range(W1.shape[1]),key=lambda i:np.max(W1[:,i]))
In [534]:
perm = find_permutation(mats)
In [535]:
def permuted_vector(perm,W1,b1,W2,b2):
    D,h = W1.shape
    K = b2.shape[1]
    #perm = find_permutation(W1)
    return mats_to_vec(W1[:,perm],b1[:,perm],W2[perm,:],b2)
In [536]:
def heuristic_alignment(record,D,h,K):
    perm_record = np.zeros(record.shape)
    for i in range(len(record)):
        mats = vec_to_mats(record[i],D,h,K)
        perm_record[i] = permuted_vector(find_permutation(mats),*mats)
    return perm_record
In [1056]:
from sklearn.decomposition import PCA
pca = PCA()
pca.fit(run_record)

plt.plot(pca.explained_variance_ratio_[:20])
Out[1056]:
[<matplotlib.lines.Line2D at 0x3e627eed0>]
In [1093]:
plt.plot(pca.transform(run_record)[:,0],label='PC1',color='blue',
         linewidth=3)
plt.plot(pca.transform(run_record)[:,1],label='PC2',color='blue',
         linewidth=2,alpha=0.7)
plt.plot(pca.transform(run_record)[:,2],label='PC3',color='blue',
         alpha=0.5)
plt.plot(pca.transform(run_record)[:,3],label='PC4',color='blue',
         linestyle='--',alpha=0.5)
plt.plot(pca.transform(run_record)[:,4],label='PC5',color='blue',
         linestyle='--',alpha=0.25)
plt.plot(pca.transform(run_record)[:,5],label='PC6+',color='grey',
         linestyle='--',alpha=0.1)
for i in range(6,10):
    plt.plot(pca.transform(run_record)[:,i],color='grey',
             linestyle='--',alpha=0.1)
plt.xlabel('Iteration #')
plt.legend(loc='best')
plt.title('Training dynamics projected onto leading principal components')
Out[1093]:
<matplotlib.text.Text at 0x46eb24810>
In [1086]:
reduced_dynamics = pca.transform(run_record)[:,:10]
In [973]:
sum(pca.explained_variance_ratio_[:3])
Out[973]:
0.96540130610302688
In [974]:
run_record.shape
Out[974]:
(10000, 183)
In [975]:
plt.plot(abs(pca.components_[1]))
Out[975]:
[<matplotlib.lines.Line2D at 0x183078d50>]
In [976]:
plt.plot(sorted(abs(pca.components_[0])))
Out[976]:
[<matplotlib.lines.Line2D at 0x182131cd0>]
In [977]:
sum(abs(pca.components_[0])>0.05)
Out[977]:
54
In [978]:
plt.scatter(pca.transform(run_record)[:,0],pca.transform(run_record)[:,1],
            c=range(num_iter),linewidths=0,alpha=0.5)
plt.colorbar()
Out[978]:
<matplotlib.colorbar.Colorbar instance at 0x163691fc8>
In [979]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

ax.scatter(pca.transform(run_record)[:,0],pca.transform(run_record)[:,1],pca.transform(run_record)[:,2],
            c=range(num_iter),linewidths=0,alpha=0.5)
#plt.colorbar()
Out[979]:
<mpl_toolkits.mplot3d.art3d.Path3DCollection at 0x2e46cccd0>
In [989]:
D=2
In [990]:
aligned_record = heuristic_alignment(run_record,D,h,K)
In [1009]:
from scipy.spatial import distance
pdist = distance.pdist(aligned_record)
In [1010]:
dist_mat = distance.squareform(pdist)
In [1011]:
plt.imshow(dist_mat,interpolation='none',
           #norm=ln,
           cmap ='Blues')
#num_ticks = 4
#plt.xticks(np.arange(4)*num_iter/num_ticks)
plt.colorbar()
plt.title('Pairwise distances between parameter configurations during training')
plt.savefig('l2-pairwise-param-vec-distances-sorted-3h-3-class.jpg',dpi=600)
In [1020]:
h=30
In [1021]:
perms = [find_permutation(vec_to_mats(vec,D,h,K)) for vec in run_record]
In []:
def find_most_common_perm(record, D,h,K):
    ''' return the permutation that occurs most often over a training record '''
    perms = [tuple(find_permutation(vec_to_mats(vec,D,h,K))) for vec in record]
    return max(set(perms), key=perms.count)
In [1022]:
perms = np.array(perms)
In [1023]:
len([p for p in perms])
Out[1023]:
10000
In [1024]:
len(set([tuple(p) for p in perms]))
Out[1024]:
1587
In [1025]:
plt.plot(sorted(np.std(perms,0)))
Out[1025]:
[<matplotlib.lines.Line2D at 0x373990b90>]
In [1012]:
from sklearn.decomposition import PCA
pca = PCA()
pca.fit(aligned_record)

plt.plot(pca.explained_variance_ratio_[:20])
Out[1012]:
[<matplotlib.lines.Line2D at 0x372bdfd50>]
In [1013]:
sum(pca.explained_variance_ratio_[:2])
Out[1013]:
0.57612004279971307
In [1014]:
plt.scatter(pca.transform(aligned_record)[:,0],pca.transform(aligned_record)[:,1],
            c=range(num_iter),linewidths=0,alpha=0.5)
plt.colorbar()
Out[1014]:
<matplotlib.colorbar.Colorbar instance at 0x373962200>
In [1045]:
pca.fit(globally_aligned_record)
Out[1045]:
PCA(copy=True, n_components=None, whiten=False)
In [1047]:
sum(pca.explained_variance_ratio_[:3])
Out[1047]:
0.96540130610302688
In [1048]:
plt.scatter(pca.transform(globally_aligned_record)[:,0],pca.transform(globally_aligned_record)[:,1],
            c=range(num_iter),linewidths=0,alpha=0.5)
plt.colorbar()
Out[1048]:
<matplotlib.colorbar.Colorbar instance at 0x162c0bcf8>
In [1026]:
perm_set_list = list(set([tuple(p) for p in perms]))
In [1027]:
mapping = dict(zip(list(set([tuple(p) for p in perms])),range(len(perm_set_list))))
In [1028]:
mapped = np.array([mapping[tuple(p)] for p in perms])
In [1030]:
res = plt.hist(mapped,bins=83);
In [1035]:
most_freq_perm = perms[mapped==np.argmax(res[0])][0]
In [665]:
most_freq_perm = perms[mapped==66][0]
most_freq_perm
Out[665]:
array([9, 0, 5, 1, 6, 8, 4, 2, 3, 7])
In [1036]:
globally_aligned_record = np.zeros(run_record.shape)
for i in range(len(run_record)):
    mats = vec_to_mats(run_record[i],D,h,K)
    globally_aligned_record[i] = permuted_vector(most_freq_perm,*mats)
In [1041]:
from scipy.spatial import distance
pdist = distance.pdist(globally_aligned_record)#[::10])
In [1042]:
dist_mat = distance.squareform(pdist)
In [1049]:
plt.imshow(dist_mat,interpolation='none',
           #norm=ln,
           cmap ='Blues')
#num_ticks = 4
#plt.xticks(np.arange(4)*num_iter/num_ticks)
plt.colorbar()
plt.title('Pairwise distances between parameter configurations during training')
plt.savefig('l2-pairwise-param-vec-distances-global-sorted-30h.jpg',dpi=600)
In [657]:
plt.plot(mapped==66)
Out[657]:
[<matplotlib.lines.Line2D at 0x154463050>]
In [641]:
best_index = np.argmin(loss_record)
distance_from_best = np.array([norm(aligned_record[i],aligned_record[best_index]) for i in xrange(len(run_record))])
best_index
Out[641]:
7460
In [642]:
plt.plot(distance_from_best)
plt.vlines(best_index,0,max(distance_from_best),linestyle='--')

plt.title('Adjusted distance from best parameter configuration')
plt.xlabel('Iteration')
plt.ylabel('Euclidean distance')
Out[642]:
<matplotlib.text.Text at 0x153baba90>
In []:
W = 0.01 * np.random.randn(D,h)
b = np.zeros((1,h))
W2 = 0.01 * np.random.randn(h,K)
b2 = np.zeros((1,K))
In []:
X = 
In []:
# can we reduce this to a sorting problem?
In [376]:
sorted(range(len(W2)),key=lambda i:np.mean(W1[:,i]))
Out[376]:
[7, 8, 4, 5, 6, 0, 9, 2, 1, 3]
In [396]:
W1.shape
Out[396]:
(2, 10)
In []:
# greedy alignment
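
One possible greedy alignment, as a sketch (greedy_alignment is a hypothetical helper, not used elsewhere in the notebook): match each hidden unit of network A to its nearest not-yet-used unit of network B, comparing the concatenated incoming weights, bias, and outgoing weights, then reorder B accordingly.

In []:
def greedy_alignment(vec_a, vec_b, D, h, K):
    ''' greedily permute the hidden units of vec_b to line up with those of vec_a '''
    Wa, ba, W2a, b2a = vec_to_mats(vec_a, D, h, K)
    Wb, bb, W2b, b2b = vec_to_mats(vec_b, D, h, K)
    # describe each hidden unit by its incoming weights, bias, and outgoing weights
    feats_a = np.vstack((Wa, ba, W2a.T)).T
    feats_b = np.vstack((Wb, bb, W2b.T)).T
    perm, used = [], set()
    for fa in feats_a:
        dists = [np.inf if j in used else np.sum((fa - feats_b[j])**2) for j in range(h)]
        j = int(np.argmin(dists))
        perm.append(j)
        used.add(j)
    return mats_to_vec(Wb[:, perm], bb[:, perm], W2b[perm, :], b2b)

# e.g. distance.pdist([greedy_alignment(end_points[0], e, D, h, K) for e in end_points])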
In []:
In []:
# let's try this for autoencoders!

# initialize parameters randomly
h = 100 # size of hidden layer
W = 0.01 * np.random.randn(D,h)
b = np.zeros((1,h))
W2 = 0.01 * np.random.randn(h,D)
b2 = np.zeros((1,D))

# some hyperparameters
step_size = 1e-0
reg = 1e-3 # regularization strength


# keep a record of training progress
params = np.hstack((np.reshape(W,W.shape[0]*W.shape[1]),np.reshape(b,b.shape[0]*b.shape[1]),
                    np.reshape(W2,W2.shape[0]*W2.shape[1]),np.reshape(b2,b2.shape[0]*b2.shape[1])))
num_params = len(params)
num_iter=10000
run_record = np.zeros((num_iter,num_params))
loss_record = np.zeros(num_iter)

# stochastic gradient descent loop
#mini_batch_size = len(X)
mini_batch_size=20
for i in xrange(num_iter):
  # minibatch
  if mini_batch_size < len(X):
    mask = npr.rand(len(X)) < (1.0*mini_batch_size / len(X))
    X_ = X[mask]
    y_ = y[mask]
  else:
    X_ = X[:]
    y_ = y[:]
  num_examples = len(X)#X_.shape[0]
  # evaluate class scores, [N x K]
  #hidden_layer = tanh(np.dot(X, W) + b)
  #hidden_layer = sigmoid(np.dot(X, W) + b)
  hidden_layer = np.maximum(0, np.dot(X, W) + b) # note, ReLU activation
  outputs = np.dot(hidden_layer, W2) + b2
  
  
  # compute the loss: average reconstruction error (squared error) and regularization
  
  # compute full loss:
  compute_full_loss=True
  if mini_batch_size == len(X) or compute_full_loss:
    
    data_loss = 0.5*np.sum((outputs - X)**2)/num_examples
    reg_loss = 0.5*reg*np.sum(W*W) + 0.5*reg*np.sum(W2*W2)
    loss = data_loss + reg_loss
    if i % 1000 == 0:
      print "iteration %d: loss %f" % (i, loss)
    loss_record[i] = loss
    # compute the gradient on the outputs (squared-error reconstruction loss)
    doutputs = (outputs - X)/num_examples
  
  # compute minibatch loss for gradient purposes:
  if mini_batch_size < len(X):
    num_examples = X_.shape[0]
    hidden_layer = np.maximum(0, np.dot(X_, W) + b) # note, ReLU activation
    outputs = np.dot(hidden_layer, W2) + b2
    
    data_loss = 0.5*np.sum((outputs - X_)**2)/num_examples
    reg_loss = 0.5*reg*np.sum(W*W) + 0.5*reg*np.sum(W2*W2)
    loss = data_loss + reg_loss
    # compute the gradient on the outputs (squared-error reconstruction loss)
    doutputs = (outputs - X_)/num_examples
  
  # backpropagate the gradient to the parameters
  # first backprop into parameters W2 and b2
  dW2 = np.dot(hidden_layer.T, doutputs)
  db2 = np.sum(doutputs, axis=0, keepdims=True)
  # next backprop into hidden layer
  dhidden = np.dot(doutputs, W2.T)
  # backprop the ReLU non-linearity
  dhidden[hidden_layer <= 0] = 0
  # finally into W,b
  dW = np.dot(X_.T, dhidden)
  db = np.sum(dhidden, axis=0, keepdims=True)
  
  # add regularization gradient contribution
  dW2 += reg * W2
  dW += reg * W
  
  # perform a parameter update
  W += -step_size * dW
  b += -step_size * db
  W2 += -step_size * dW2
  b2 += -step_size * db2
    
  run_record[i] = np.hstack((np.reshape(W,W.shape[0]*W.shape[1]),np.reshape(b,b.shape[0]*b.shape[1]),
                               np.reshape(W2,W2.shape[0]*W2.shape[1]),np.reshape(b2,b2.shape[0]*b2.shape[1])))